home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Cream of the Crop 22
/
Cream of the Crop 22.iso
/
os2
/
htm2txt1.zip
/
HTM2TXT.CMD
< prev
next >
Wrap
OS/2 REXX Batch file
|
1996-09-21
|
27KB
|
646 lines
/* ----------------------------------------------------------------- */
lastmod='1996-09-21'
/* 96-09-21 or rework follow href= error */
/* 96-09-11 or <a ...> tag due to etraas@te.xs4all.nl */
/* 96-08-21 or rework <table> tag error */
/* 96-08-03 or rework <table> tag due to "Kirchner Soft" */
/* 96-08-02 or rework <table> tag due to pinkas@en.com */
/* 96-07-16 or follow href= */
/* 96-06-29 or rework */
/* 96-04-17 or try to support <table> */
/* 96-01-15 or reworked */
/* 95-07-10 or decode HTML files */
/* --------------------------------------------------------- */
/* call: htm2txt infile [l nn [p nn [e editor [u */
/* output: infile-name.TXT */
/* */
/* recognised tags: */
/* */
/* all tags as supported by ibm webex 1.1b */
/* */
/* special tags: */
/* */
/* <trace> trace '?i' */
/* <exit> exit immediately */
/* */
/* recognised substitute variables see variable 'consts' */
/* tab-char ' ' will be ignored */
/* */
/* notes: */
/* */
/* all tags are converted as 'best fit'. */
/* the image a browser produces will not be met. */
/* */
/* --------------------------------------------------------- */
/* variables to be customized */
/* */
linemax=72 /* maximum line length */
pixlbyt= 6 /* pixels per byte for tables */
editor ='e' /* editor for output file */
/* --------------------------------------------------------- */
/* constants as known of today */
/* */
consts= "&space '20'x"
consts=consts " '20'x"
consts=consts "ß ß"
consts=consts "ä ä"
consts=consts "Ä Ä"
consts=consts "ö ö"
consts=consts "Ö Ö"
consts=consts "ü ü"
consts=consts "Ü Ü"
consts=consts "á á"
consts=consts "é é"
consts=consts "í í"
consts=consts " '20'x"
consts=consts ": :"
consts=consts "< <"
consts=consts "> >"
consts=consts "[ ["
consts=consts "] ]"
consts=consts "’ '" /* Æ */
consts=consts '" "'
consts=consts '‘ "' /* æ */
consts=consts '“ "' /* ô */
consts=consts '” "' /* ö */
consts=consts " 'a0'x"
consts=consts "° °" /* ° EBCDIC !!! */
consts=consts "¹ '" /* ╣ */
consts=consts "&mdash -"
consts=consts "< <"
consts=consts "> >"
consts=consts "& '00'x"
consts=consts "© '20'x"
/* --------------------------------------------------------- */
/* check input parameters */
/* */
parse upper arg ifiname options
if ifiname='' then exit 4
parse var ifiname fn '.' ext
if ext='' then ifiname=ifiname'.HTM'
ofiname = fn'.TXT'
/* --------------------------------------------------------- */
/* check for options */
/* */
swi_url=0
do while options \= ''
parse upper var options opt val options
select
when opt='L' then linemax=val
when opt='P' then pixlbyt=val
when opt='E' then editor =val
when opt='U' then do
swi_url=1
options=val options
end
otherwise nop
end
end
/* --------------------------------------------------------- */
/* activate debug facilities */
/* */
/*
signal on syntax
signal on error
signal on failure
signal on halt
*/
/* --------------------------------------------------------- */
/* some global controls */
/* */
hrf.0=1 /* href-control */
hrf.1=ifiname
'erase' ofiname '2>NUL'
call lineout ofiname,'HTM2TXT v.' lastmod
call lineout ofiname,' '
call lineout ofiname,'Extracted from' ifiname',' date()',' left(time(),5)
call lineout ofiname,' '
do nexthrf=1 while hrf.0>=nexthrf
call process_file hrf.nexthrf
end
call lineout ofiname
say
if editor \= '' then 'start /F' editor ofiname /* <=== edit result */
/* --------------------------------------------------------- */
exit 0
/* --------------------------------------------------------- */
/* process a file */
/* */
process_file: parse arg ifiname
/* --------------------------------------------------------- */
/* read infile */
/* */
nl ='0d'x /* new line character */
ifi=''
say
say 'reading' ifiname
call stream ifiname,'c','close'
do i=1 while chars(ifiname)>1
l=linein(ifiname)||nl
l=translate(l,' ','09'x)
ifi = ifi||l
end
call stream ifiname,'c','close'
say i-1 'records read from' ifiname
/* --------------------------------------------------------- */
/* format outfile lines */
/* */
ofi.0=0 /* out file controls */
dlspaces ='' /* <DL>-spaces */
lispaces ='' /* <LI>-spaces */
indents =0 /* number of indents */
blanklines=0 /* number of blank lines */
linelen =linemax /* max. linelength */
outtext ='' /* initial text */
/* switches: */
swi_pre = 0 /* switch PRE */
swi_tbl = 0 /* switch table definition */
swi_lst = 0 /* switch list definition */
swi_cnt = 0 /* switch center text */
swi_cat = 0 /* switch concatenate */
swi_trc = 0 /* switch trace */
/* --------------------------------------------------------- */
/* scan input stream */
/* */
call charout ,'processing token '
text=''
do count=1 while length(ifi)>0
call charout ,format(count,5) copies('08'x,6)
if swi_trc then trace 'i'
/* check next line */
parse var ifi parttext '<' tag '>' ifi
/* process text */
select
when swi_pre then call process_preformatted
when strip(parttext)=nl then nop
otherwise do
do while pos(nl,parttext)>0
parse var parttext a (nl) b
parttext=strip(a) strip(b)
end
if swi_cat then text=text||parttext
else do
if text='' then text= parttext
else text=text parttext
end
end
end
/* process tag */
tag=translate(tag,' ',nl)
if left(tag,1)='!' then tag='!' substr(tag,2)
parse var tag tag options
tag=translate(tag)
swi_cat=0
select
when tag='TRACE' then swi_trc=1
when tag='EXIT' then signal finish
when tag='!' then call out '***' options '***'
when tag='FONT' then swi_cat=1
when tag='UL',
| tag='OL',
| tag='DL',
| tag='DIR',
| tag='MENU',
then do
call out text
call out ' '
lispaces=' * '
indents=indents+1
swi_lst=1
end
when tag='LI' then call out text
when tag='DT' then do
call out text
lispaces=' * '
if indents>0 then indents=indents-1
end
when tag='DD' then do
call out text
lispaces=' '
indents=indents+1
end
when tag='/UL',
| tag='/OL',
| tag='/DL',
| tag='/DIR',
| tag='/MENU',
then do
call out text
lispaces=''
if indents>0 then indents=indents-1
call out ' '
swi_lst=0
end
when tag='CENTER',
| tag='CENTRE',
then swi_cnt=1
when tag='/CENTER',
| tag='/CENTRE',
then do
swi_cnt=0
call out text
end
when tag='P',
| tag='/TITLE',
| tag='/CENTER',
| tag='/CENTRE',
then call out text
when tag='/HEAD',
then do
call out text
call out ' '
end
when tag='PRE' then do
swi_pre=1
linelen=parmval('WIDTH',options)
end
when tag='/PRE' then do
swi_pre=0
linelen=linemax
end
when tag='HR' then call out copies('-',linelen)
when tag='H1',
| tag='H2',
| tag='H3',
| tag='H4',
| tag='/H1',
| tag='/H2',
| tag='/H3',
| tag='/H4',
| tag='/CAPTION',
then do
call out text
call out ' '
end
when tag='A' then do
parse upper var options 'HREF' . '"' hrefid '"'
nogo= pos('#',hrefid)>0
srefid=''
if swi_url,
& \nogo then do
srefid=hrefid
end
parse var hrefid z '.' fext
nogo=nogo|(left(fext,3)\='HTM')
parse var hrefid z 'FILE:' hrefid
if hrefid='' then hrefid=z
nogo=nogo|(strip(hrefid)='')
do i=1 to hrf.0
if hrf.i=hrefid then leave
end
if (i>hrf.0)&(\nogo) then do
hrf.0=hrf.0+1; z=hrf.0; hrf.z=hrefid
end
end
when tag='/A' then do
if swi_url,
& srefid\='' then do
text=text '('srefid')'
srefid=''
end
end
/*
when tag='IMG' then do
z=parmval('ALT',options)
if z\=0 then do
if swi_tbl then do
text=z
call save_table_text
end
else text=text z
end
end
*/
when tag='TABLE' then do
call out text
call out ' '
swi_tbl=1
swi_wid=1
tbwid. =0
end
when tag='TR' then do
tbcol=0
tbmax=0
drop tbtxt.
end
when tag='TD' then do
/* determine next column */
z=parmval('COLSTART',options)
if z=0 then tbcol=tbcol+1
else tbcol=z
if tbmax<tbcol then tbmax=tbcol
/* check for width= tag */
p=parmval('WIDTH',options)
if p>0 then do
select
when right(p,3)='PIX' then do
parse var p n 'PIX' .
tbwid.tbcol=n%pixlbyt
end
when right(p,1)='%' then do
parse var p n '%' .
tbwid.tbcol=(n*linelen)%100
end
otherwise
if p>linemax then p=linemax
tbwid.tbcol=p
end
end
/* set lines/col to 0 */
tblin.tbcol=0
end
when tag='/TD' then do
if swi_tbl then call save_table_text
end
when tag='/TR' then do
if swi_tbl then do
/* col-width already done ? */
if swi_wid then do
swi_wid=0
/* check predefined col-width */
colwi=0
do i=1 to tbmax
colwi=colwi+tbwid.i
end
linelen=linemax-colwi
if linelen<=0 then linelen=linemax
/* set col-width if not set */
do i=1 to tbmax
if tbwid.i>0 then iterate
tbwid.i=linelen%tbmax
end
linelen=linemax
/* check sum colwid exceeds */
sum_col=0
do i=1 to tbmax
sum_col=sum_col+tbwid.i
end
if sum_col>linemax then do
ratio=linemax/sum_col
do i=1 to tbmax
tbwid.i=trunc(tbwid.i/ratio)
end
end
end
/* get max nr. lines in row */
lnmax=1
do i=1 to tbmax
if lnmax<tblin.i then lnmax=tblin.i
end
/* fill uninitlzd variables */
do y=1 to lnmax
do k=1 to tbmax
tbtxt.k.y=subs(tbtxt.k.y)
if left(tbtxt.k.y,6)\='TBTXT.' then iterate
if k=1 then tbtxt.k.y='_'
else tbtxt.k.y=''
end
end
/* scan all lines all cols */
do y=1 to lnmax
anytxt=0
do k=1 to tbmax
if strip(tbtxt.k.y)='' then iterate
anytxt=1
leave
end
do while anytxt
anytxt=0
do k=1 to tbmax
/* check length fits */
if length(tbtxt.k.y)>tbwid.k ,
& tbwid.k>0 then do
z=lastpos(' ',tbtxt.k.y,tbwid.k)
if z=0 then do /* give up */
otext=tbtxt.k.y
tbtxt.k.y=''
end
else do /* split text */
otext=left(tbtxt.k.y,z)
tbtxt.k.y=substr(tbtxt.k.y,z)
anytxt=1
end
end
else do
otext=tbtxt.k.y
tbtxt.k.y=''
end
tbtxt.1.y='_'
/* build output line */
text=text left(otext,tbwid.k)
end
/* all cols processed */
call out_table_text
end
end
end
end
when tag='/TABLE' then do
blanklines=0
call out ' '
swi_tbl=0
end
when tag='BR' then do
if swi_lst then call out text
if swi_tbl ,
& (tbmax>1) then call save_table_text
else call out text
end
otherwise nop
end
/* all finished */
end
/* --------------------------------------------------------- */
/* write outfile */
/* */
finish:
say
do i=1 to ofi.0
call lineout ofiname,ofi.i
end
/* --------------------------------------------------------- */
return
/* ========================================================= */
/* --------------------------------------------------------- */
/* save table-text */
/* */
save_table_text:
if strip(text)='' then return
tblin.tbcol=tblin.tbcol+1
z=tblin.tbcol
tbtxt.tbcol.z=text
text=''
return
/* --------------------------------------------------------- */
/* out table-text */
/* */
out_table_text:
text = strip(text)
if text ='' then return
if text \= '_' then call o text
text = ''
return
/* --------------------------------------------------------- */
/* process preformatted */
/* */
process_preformatted:
do while length(parttext)>0
parse var parttext outtext (nl) parttext
call out outtext
end
return
/* --------------------------------------------------------- */
/* extract parameter values */
/* */
parmval: procedure; parse upper arg key,string
z=pos(key,string)
if z=0 then return 0
string=substr(string,z)
parse var string '=' val .
val=translate(val,' ','"')
val=translate(strip(val))
return val
/* --------------------------------------------------------- */
/* do output lines */
/* */
out:
oli=subs(arg(1))
oll=length(oli)
/* do not output more than 1 blank line */
if oll=0 then do
if blanklines>0 then return
blanklines=blanklines+1
end
if linelen>0 then do
do while oll>linelen
z=lastpos(' ',oli,linelen)
if z=0 then z=oll
if (z>0) then do
call o left(oli,z)
oli=strip(substr(oli,z+1))
oll=length(oli)
end
end
end
call o oli
if oll>0 then blanklines=0
text=''
return
o: procedure expose swi_cnt linelen indents dlspaces lispaces ofi.
parse arg ooo
if swi_cnt then do
z=(linelen-length(ooo))%2
if z>0 then prefix=copies(' ',z)
else prefix=''
end
else do
prefix=copies(' ',indents)||lispaces||dlspaces
end
ofi.0=ofi.0+1; z=ofi.0; ofi.z=prefix||ooo
return
/* --------------------------------------------------------- */
/* substitute constants */
/* */
subs: procedure expose consts;
l = arg(1)
/* check for tab chars */
l=translate(l,' ','09'x)
/* check for variables */
z=pos('&',l)
if z=0 then return strip(l)
do while z > 0
head = left(l,z-1)
token = substr(l,z)
do i=1 to words(consts) by 2
a=word(consts,i)
b=length(a)
c=left(token,b)
d=word(consts,i+1)
if right(d,2)="'x" then interpret "d="d
if c=a then do
head=head||d
token=substr(token,b+2)
leave
end
end
if i>words(consts) then do
token='?'substr(token,2)
end
l = head||token
z=pos('&',l)
end
return strip(translate(l,'&','00'x))
/* --------------------------------------------------------- */
syntax:
say 'signal on syntax in' sigl':' strip(sourceline(sigl))
signal common_error
error:
say 'signal on error in' sigl':' strip(sourceline(sigl))
signal common_error
failure:
say 'signal on failure in' sigl':' strip(sourceline(sigl))
signal common_error
halt:
say 'signal on halt in' sigl':' strip(sourceline(sigl))
signal common_error
common_error:
trace '?i'
do forever
nop
end
/* --------------------------------------------------------- */